001 /* 002 * NeedlemanWunsch.java 003 * 004 * Copyright 2003 Sergio Anibal de Carvalho Junior 005 * 006 * This file is part of NeoBio. 007 * 008 * NeoBio is free software; you can redistribute it and/or modify it under the terms of 009 * the GNU General Public License as published by the Free Software Foundation; either 010 * version 2 of the License, or (at your option) any later version. 011 * 012 * NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; 013 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR 014 * PURPOSE. See the GNU General Public License for more details. 015 * 016 * You should have received a copy of the GNU General Public License along with NeoBio; 017 * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, 018 * Boston, MA 02111-1307, USA. 019 * 020 * Proper attribution of the author as the source of the software would be appreciated. 021 * 022 * Sergio Anibal de Carvalho Junior mailto:sergioanibaljr@users.sourceforge.net 023 * Department of Computer Science http://www.dcs.kcl.ac.uk 024 * King's College London, UK http://www.kcl.ac.uk 025 * 026 * Please visit http://neobio.sourceforge.net 027 * 028 * This project was supervised by Professor Maxime Crochemore. 029 * 030 */ 031 032 package neobio.alignment; 033 034 import java.io.Reader; 035 import java.io.IOException; 036 037 /** 038 * This class implements the classic global alignment algorithm (with linear gap penalty 039 * function) due to S.B.Needleman and C.D.Wunsch (1970). 040 * 041 * <P>It is based on a dynamic programming approach. The idea consists of, given two 042 * sequences A and B of sizes n and m, respectively, building an (n+1 x m+1) matrix M that 043 * contains the similarity of prefixes of A and B. Every position M[i,j] in the matrix 044 * holds the score between the subsequences A[1..i] and B[1..j]. The first row and column 045 * represent alignments with spaces.</P> 046 * 047 * <P>Starting from row 0, column 0, the algorithm computes each position M[i,j] with the 048 * following recurrence:</P> 049 * 050 * <CODE><BLOCKQUOTE><PRE> 051 * M[0,0] = 0 052 * M[i,j] = max { M[i,j-1] + scoreInsertion (B[j]), 053 * M[i-1,j-1] + scoreSubstitution (A[i], B[j]), 054 * M[i-1,j] + scoreDeletion(A[i]) } 055 * </PRE></BLOCKQUOTE></CODE> 056 * 057 * <P>In the end, the value at the last position (last row, last column) will contain 058 * the similarity between the two sequences. This part of the algorithm is accomplished 059 * by the {@link #computeMatrix computeMatrix} method. It has quadratic space complexity 060 * since it needs to keep an (n+1 x m+1) matrix in memory. And since the work of computing 061 * each cell is constant, it also has quadratic time complexity.</P> 062 * 063 * <P>After the matrix has been computed, the alignment can be retrieved by tracing a path 064 * back in the matrix from the last position to the first. This step is performed by 065 * the {@link #buildOptimalAlignment buildOptimalAlignment} method, and since the path can 066 * be roughly as long as (m + n), this method has O(n) time complexity.</P> 067 * 068 * <P>If the similarity value only is needed (and not the alignment itself), it is easy to 069 * reduce the space requirement to O(n) by keeping just the last row or column in memory. 070 * This is precisely what is done by the {@link #computeScore computeScore} method. Note 071 * that it still requires O(n<SUP>2</SUP>) time.</P> 072 * 073 * <P>For a more efficient approach to the global alignment problem, see the 074 * {@linkplain CrochemoreLandauZivUkelson} algorithm. For local alignment, see the 075 * {@linkplain SmithWaterman} algorithm.</P> 076 * 077 * @author Sergio A. de Carvalho Jr. 078 * @see SmithWaterman 079 * @see CrochemoreLandauZivUkelson 080 * @see CrochemoreLandauZivUkelsonLocalAlignment 081 * @see CrochemoreLandauZivUkelsonGlobalAlignment 082 */ 083 public class NeedlemanWunsch extends PairwiseAlignmentAlgorithm 084 { 085 /** 086 * The first sequence of an alignment. 087 */ 088 protected CharSequence seq1; 089 090 /** 091 * The second sequence of an alignment. 092 */ 093 protected CharSequence seq2; 094 095 /** 096 * The dynamic programming matrix. Each position (i, j) represents the best score 097 * between the firsts i characters of <CODE>seq1</CODE> and j characters of 098 * <CODE>seq2</CODE>. 099 */ 100 protected int[][] matrix; 101 102 /** 103 * Loads sequences into {@linkplain CharSequence} instances. In case of any error, 104 * an exception is raised by the constructor of <CODE>CharSequence</CODE> (please 105 * check the specification of that class for specific requirements). 106 * 107 * @param input1 Input for first sequence 108 * @param input2 Input for second sequence 109 * @throws IOException If an I/O error occurs when reading the sequences 110 * @throws InvalidSequenceException If the sequences are not valid 111 * @see CharSequence 112 */ 113 protected void loadSequencesInternal (Reader input1, Reader input2) 114 throws IOException, InvalidSequenceException 115 { 116 // load sequences into instances of CharSequence 117 this.seq1 = new CharSequence(input1); 118 this.seq2 = new CharSequence(input2); 119 } 120 121 /** 122 * Frees pointers to loaded sequences and the dynamic programming matrix so that their 123 * data can be garbage collected. 124 */ 125 protected void unloadSequencesInternal () 126 { 127 this.seq1 = null; 128 this.seq2 = null; 129 this.matrix = null; 130 } 131 132 /** 133 * Builds an optimal global alignment between the loaded sequences after computing the 134 * dynamic programming matrix. It calls the <CODE>buildOptimalAlignment</CODE> method 135 * after the <CODE>computeMatrix</CODE> method computes the dynamic programming 136 * matrix. 137 * 138 * @return an optimal global alignment between the loaded sequences 139 * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible 140 * with the loaded sequences. 141 * @see #computeMatrix 142 * @see #buildOptimalAlignment 143 */ 144 protected PairwiseAlignment computePairwiseAlignment () 145 throws IncompatibleScoringSchemeException 146 { 147 // compute the matrix 148 computeMatrix (); 149 150 // build and return an optimal global alignment 151 PairwiseAlignment alignment = buildOptimalAlignment (); 152 153 // allow the matrix to be garbage collected 154 matrix = null; 155 156 return alignment; 157 } 158 159 /** 160 * Computes the dynamic programming matrix. 161 * 162 * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible 163 * with the loaded sequences. 164 */ 165 protected void computeMatrix () throws IncompatibleScoringSchemeException 166 { 167 int r, c, rows, cols, ins, del, sub; 168 169 rows = seq1.length()+1; 170 cols = seq2.length()+1; 171 172 matrix = new int [rows][cols]; 173 174 // initiate first row 175 matrix[0][0] = 0; 176 for (c = 1; c < cols; c++) 177 matrix[0][c] = matrix[0][c-1] + scoreInsertion(seq2.charAt(c)); 178 179 // calculates the similarity matrix (row-wise) 180 for (r = 1; r < rows; r++) 181 { 182 // initiate first column 183 matrix[r][0] = matrix[r-1][0] + scoreDeletion(seq1.charAt(r)); 184 185 for (c = 1; c < cols; c++) 186 { 187 ins = matrix[r][c-1] + scoreInsertion(seq2.charAt(c)); 188 sub = matrix[r-1][c-1] + scoreSubstitution(seq1.charAt(r),seq2.charAt(c)); 189 del = matrix[r-1][c] + scoreDeletion(seq1.charAt(r)); 190 191 // choose the greatest 192 matrix[r][c] = max (ins, sub, del); 193 } 194 } 195 } 196 197 /** 198 * Builds an optimal global alignment between the loaded sequences. Before it is 199 * executed, the dynamic programming matrix must already have been computed by 200 * the <CODE>computeMatrix</CODE> method. 201 * 202 * @return an optimal global alignment between the loaded sequences 203 * @throws IncompatibleScoringSchemeException If the scoring scheme 204 * is not compatible with the loaded sequences. 205 * @see #computeMatrix 206 */ 207 protected PairwiseAlignment buildOptimalAlignment () 208 throws IncompatibleScoringSchemeException 209 { 210 StringBuffer gapped_seq1, score_tag_line, gapped_seq2; 211 int r, c, sub, max_score; 212 213 gapped_seq1 = new StringBuffer(); 214 score_tag_line = new StringBuffer(); 215 gapped_seq2 = new StringBuffer(); 216 217 // start at the last row, last column 218 r = matrix.length - 1; 219 c = matrix[r].length - 1; 220 max_score = matrix[r][c]; 221 222 while ((r > 0) || (c > 0)) 223 { 224 if (c > 0) 225 if (matrix[r][c] == matrix[r][c-1] + scoreInsertion(seq2.charAt(c))) 226 { 227 // insertion was used 228 gapped_seq1.insert (0, GAP_CHARACTER); 229 score_tag_line.insert (0, GAP_TAG); 230 gapped_seq2.insert (0, seq2.charAt(c)); 231 c = c - 1; 232 233 // skip to the next iteration 234 continue; 235 } 236 237 if ((r > 0) && (c > 0)) 238 { 239 sub = scoreSubstitution(seq1.charAt(r), seq2.charAt(c)); 240 241 if (matrix[r][c] == matrix[r-1][c-1] + sub) 242 { 243 // substitution was used 244 gapped_seq1.insert (0, seq1.charAt(r)); 245 if (seq1.charAt(r) == seq2.charAt(c)) 246 if (useMatchTag()) 247 score_tag_line.insert (0, MATCH_TAG); 248 else 249 score_tag_line.insert (0, seq1.charAt(r)); 250 else if (sub > 0) 251 score_tag_line.insert (0, APPROXIMATE_MATCH_TAG); 252 else 253 score_tag_line.insert (0, MISMATCH_TAG); 254 gapped_seq2.insert (0, seq2.charAt(c)); 255 r = r - 1; c = c - 1; 256 257 // skip to the next iteration 258 continue; 259 } 260 } 261 262 // must be a deletion 263 gapped_seq1.insert (0, seq1.charAt(r)); 264 score_tag_line.insert (0, GAP_TAG); 265 gapped_seq2.insert (0, GAP_CHARACTER); 266 r = r - 1; 267 } 268 269 return new PairwiseAlignment (gapped_seq1.toString(), score_tag_line.toString(), 270 gapped_seq2.toString(), max_score); 271 } 272 273 /** 274 * Computes the score of the best global alignment between the two sequences using the 275 * scoring scheme previously set. This method calculates the similarity value only 276 * (doesn't build the whole matrix so the alignment cannot be recovered, however it 277 * has the advantage of requiring O(n) space only). 278 * 279 * @return score of the best global alignment between the loaded sequences 280 * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible 281 * with the loaded sequences. 282 */ 283 protected int computeScore () throws IncompatibleScoringSchemeException 284 { 285 int[] array; 286 int r, c, rows, cols, tmp, ins, del, sub; 287 288 rows = seq1.length()+1; 289 cols = seq2.length()+1; 290 291 if (rows <= cols) 292 { 293 // goes columnwise 294 array = new int [rows]; 295 296 // initiate first column 297 array[0] = 0; 298 for (r = 1; r < rows; r++) 299 array[r] = array[r-1] + scoreDeletion(seq1.charAt(r)); 300 301 // calculate the similarity matrix (keep current column only) 302 for (c = 1; c < cols; c++) 303 { 304 // initiate first row (tmp hold values 305 // that will be later moved to the array) 306 tmp = array[0] + scoreInsertion(seq2.charAt(c)); 307 308 for (r = 1; r < rows; r++) 309 { 310 ins = array[r] + scoreInsertion(seq2.charAt(c)); 311 sub = array[r-1] + scoreSubstitution(seq1.charAt(r), seq2.charAt(c)); 312 del = tmp + scoreDeletion(seq1.charAt(r)); 313 314 // move the temp value to the array 315 array[r-1] = tmp; 316 317 // choose the greatest 318 tmp = max (ins, sub, del); 319 } 320 321 // move the temp value to the array 322 array[rows - 1] = tmp; 323 } 324 325 return array[rows - 1]; 326 } 327 else 328 { 329 // goes rowwise 330 array = new int [cols]; 331 332 // initiate first row 333 array[0] = 0; 334 for (c = 1; c < cols; c++) 335 array[c] = array[c-1] + scoreInsertion(seq2.charAt(c)); 336 337 // calculate the similarity matrix (keep current row only) 338 for (r = 1; r < rows; r++) 339 { 340 // initiate first column (tmp hold values 341 // that will be later moved to the array) 342 tmp = array[0] + scoreDeletion(seq1.charAt(r)); 343 344 for (c = 1; c < cols; c++) 345 { 346 ins = tmp + scoreInsertion(seq2.charAt(c)); 347 sub = array[c-1] + scoreSubstitution(seq1.charAt(r), seq2.charAt(c)); 348 del = array[c] + scoreDeletion(seq1.charAt(r)); 349 350 // move the temp value to the array 351 array[c-1] = tmp; 352 353 // choose the greatest 354 tmp = max (ins, sub, del); 355 } 356 357 // move the temp value to the array 358 array[cols - 1] = tmp; 359 } 360 361 return array[cols - 1]; 362 } 363 } 364 }